## packages
library(tidyverse)
library(tidytext)
library(sf)
library(maps)
library(albersusa)
library(rgeocodio)
library(geojsonio)
library(rgeos)
library(ggwordcloud)
library(patchwork)
library(pdftools)
library(showtext)

## save plots?
## NOTE(review): `save` shadows base::save() — it works because R resolves
## function calls separately, but a more distinctive name (e.g. `save_plots`)
## would be safer
save <- TRUE
#save <- FALSE

## quality of png's (dots per inch used when exporting plots)
dpi <- 750

## font
## register Google fonts so they can be rendered via the showtext package
font_add_google("Montserrat", "Montserrat")
font_add_google("Overpass", "Overpass")
font_add_google("Overpass Mono", "Overpass Mono")

## theme updates
## base: ggthemes' "clean" theme at size 15 with Montserrat as default font
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))

## global modifications applied on top of the base theme for all plots
## NOTE(review): `size` in element_line() is deprecated in favor of
## `linewidth` since ggplot2 3.4 — fine on older versions; confirm the
## ggplot2 version used for rendering
theme_update(plot.margin = margin(30, 30, 30, 30),
             plot.background = element_rect(color = "white",
                                            fill = "white"),
             ## centered bold title with extra space below it
             plot.title = element_text(size = 16,
                                       face = "bold",
                                       lineheight = 1.15,
                                       hjust = .5,
                                       margin = margin(10, 0, 25, 0)),
             #plot.title.position = "plot",
             ## muted caption, pulled down toward the plot edge (negative margin)
             plot.caption = element_text(color = "grey40", 
                                         size = 9,
                                         margin = margin(20, 0, -20, 0)),
             plot.caption.position = "plot",
             axis.line.x = element_line(color = "black",
                                        size = .8),
             axis.line.y = element_line(color = "black",
                                        size = .8),
             axis.title.x = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(t = 20)),
             axis.title.y = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(r = 20)),
             axis.text = element_text(size = 11,
                                      color = "black",
                                      face = "bold"),
             axis.text.x = element_text(margin = margin(t = 10)),
             axis.text.y = element_text(margin = margin(r = 10)),
             axis.ticks = element_blank(),
             ## light grey grid lines; minor y grid intentionally removed
             panel.grid.major.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.major.y = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.y = element_blank(),
             panel.spacing.x = unit(4, "lines"),
             panel.spacing.y = unit(2, "lines"),
             legend.position = "top",
             legend.title = element_text(family = "Montserrat",
                                         color = "black",
                                         size = 14,
                                         margin = margin(5, 0, 5, 0)),
             legend.text = element_text(family = "Montserrat",
                                        color = "black",
                                        size = 11,
                                        margin = margin(4.5, 4.5, 4.5, 4.5)),
             legend.background = element_rect(fill = NA,
                                              color = NA),
             legend.key = element_rect(color = NA, fill = NA),
             #legend.key.width = unit(5, "lines"),
             #legend.spacing.x = unit(.05, "pt"),
             #legend.spacing.y = unit(.55, "pt"),
             #legend.margin = margin(0, 0, 10, 0),
             strip.text = element_text(face = "bold",
                                       margin = margin(b = 10)))

## theme settings for flipped plots:
## swap which axis keeps the minor grid lines (y instead of x)
theme_flip <- theme(
  panel.grid.minor.y = element_line(color = "#eaeaea", size = .6),
  panel.grid.minor.x = element_blank()
)

## theme settings for maps:
## empty base theme (no axes/grid) plus a horizontal legend and
## centered, bold title/subtitle; negative margins tighten the layout
theme_map <-
  theme_void(base_family = "Montserrat") +
  theme(
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.margin = margin(10, 10, 10, 10),
    legend.title = element_text(size = 17, face = "bold"),
    legend.text = element_text(color = "grey33", size = 12),
    plot.margin = margin(15, 5, 15, 5),
    plot.title = element_text(
      face = "bold", size = 20, hjust = .5,
      margin = margin(30, 0, 10, 0)
    ),
    plot.subtitle = element_text(
      face = "bold", color = "grey33", size = 17, hjust = .5,
      margin = margin(10, 0, -30, 0)
    ),
    plot.caption = element_text(
      size = 14, color = "grey33", hjust = .97,
      margin = margin(-30, 0, 0, 0)
    )
  )

## numeric format for labels
## (thousands separated by commas, scientific notation disabled;
## fixed: `scientific = F` — never use T/F, they are reassignable)
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = FALSE)

## main color backlinko
bl_col <- "#00d188"

## colors + labels for interval stripes (light -> dark green)
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")

## colors for degrees (Bachelors, Masters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", colorspace::darken(bl_col, .1))

## gradient colors for position: 10 steps from the main green to pale green
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)

0. Data

## prepare GlassDoor data for joining:
## derive `employment_type` and `seniority` (missing from GlassDoor) by
## pattern-matching the job description/title, then keep only the columns
## shared with the LinkedIn data
df_gd_join <-
  df_gd %>% 
  mutate(
    ## patterns drop the first letter (e.g. "ulltime") so both upper- and
    ## lower-case spellings match without a case-insensitive regex
    employment_type = case_when(
      str_detect(description, "ulltime|ull-time|ull-Time") ~ "Full-time",
      str_detect(description, "arttime|art-time|art-Time") ~ "Part-time",
      TRUE ~ NA_character_
    ),
    ## order matters: the first matching condition wins, so internships are
    ## detected before junior/senior, and description is checked before title
    seniority = case_when(
      str_detect(description, "nternship") ~ "Internship",
      str_detect(job_title, "intern|Intern") ~ "Internship",
      str_detect(description, "junior|Junior") ~ "Junior",
      str_detect(job_title, "junior|Junior") ~ "Junior",
      str_detect(description, "senior|Senior") ~ "Senior",
      str_detect(job_title, "senior|Senior") ~ "Senior",
      str_detect(description, "ntry level|ntry Level") ~ "Entry level",
      str_detect(job_title, "id-Senior|id-senior|id Senior|id senior") ~ "Mid-Senior level",
      str_detect(job_title, "irector") ~ "Director",
      str_detect(job_title, "xecutive") ~ "Executive",
      TRUE ~ NA_character_
    )
  ) %>% 
  dplyr::select(
    job_title, 
    employer, 
    location,
    size,
    description,
    seniority,
    employment_type,
    industry,
    sector
  )

###################################################
## all job offers worldwide for global map:
## combine the LinkedIn and GlassDoor offers, normalize whitespace in the
## descriptions, and drop duplicate offers
df_world <- 
  df_li %>% 
  dplyr::select(
    job_title, 
    employer, 
    location,
    size,
    description,
    seniority,
    employment_type,
    industry,
    "sector" = job_functions
  ) %>% 
  ## full_join keeps offers from both sources (joins on all shared columns)
  full_join(df_gd_join) %>% 
  mutate(
    ## collapse carriage returns, newlines and repeated whitespace into
    ## single spaces so duplicate descriptions compare equal
    description = str_replace_all(description, "\\r", " "),
    description = str_replace_all(description, "\\n", " "),
    description = str_replace_all(description, "\\s+", " ")
  ) %>% 
  ## the same offer can appear in both sources: de-duplicate on key fields
  distinct(job_title, employer, location, description, .keep_all = TRUE)

###################################################
## only keep pages with language == EN
## (identified via the country subdomain of the LinkedIn url;
## fixed: dots in the regexes are now escaped — an unescaped "." matched
## ANY character, so e.g. "^https://www." also matched "https://wwwX...")
df_li_en <-
  df_li %>% 
  ## filter for subdomains of English-speaking countries
  filter(
    str_detect(url, "^https://www\\.") | 
    str_detect(url, "^https://au\\.") | 
    #str_detect(url, "^https://be\\.") | 
    str_detect(url, "^https://ca\\.") | 
    #str_detect(url, "^https://gh\\.") | 
    #str_detect(url, "^https://gr\\.") | 
    str_detect(url, "^https://ie\\.") | 
    #str_detect(url, "^https://il\\.") | 
    #str_detect(url, "^https://in\\.") | 
    #str_detect(url, "^https://mg\\.") | 
    #str_detect(url, "^https://ng\\.") | 
    #str_detect(url, "^https://ph\\.") | 
    #str_detect(url, "^https://sg\\.") | 
    #str_detect(url, "^https://ua\\.") | 
    #str_detect(url, "^https://vn\\.") | 
    str_detect(url, "^https://za\\.")
  ) %>% 
  mutate(
    ## the first 11 characters cover "https://xx." (or "https://www");
    ## strip the protocol and the dot to keep the bare subdomain
    country = str_sub(url, start = 1, end = 11),
    country = str_remove(country, "https://"),
    country = str_remove(country, "\\.")
  )

## join data:
## merge the English LinkedIn subset with the GlassDoor data, harmonizing
## the free-text `job_functions` into GlassDoor-style sector labels
df_en <- 
  df_li_en %>% 
  dplyr::select(
    job_title, 
    employer, 
    location,
    size,
    description,
    seniority,
    employment_type,
    industry,
    "sector" = job_functions
  ) %>% 
  mutate(
    ## order matters: the first matching pattern wins
    sector = case_when(
      str_detect(sector, "Business") ~ "Business Services",
      str_detect(sector, "Information Technology") ~ "Information Technology",
      str_detect(sector, "Consume") ~ "Consumer Services",
      str_detect(sector, "Health") ~ "Health Care",
      str_detect(sector, "Educat") ~ "Education",
      str_detect(sector, "Retail") ~ "Retail",
      str_detect(sector, "Insurance") ~ "Insurance",
      str_detect(sector, "Media") ~ "Media",
      str_detect(sector, "Manufact") ~ "Manufacturing",
      str_detect(sector, "Account|Legal") ~ "Accounting & Legal",
      str_detect(sector, "Travel|Tourism") ~ "Travel & Tourism",
      str_detect(sector, "Biotech|Pharma") ~ "Biotech & Pharmaceuticals",
      str_detect(sector, "Restaurant|Bar|Food") ~ "Restaurants, Bars & Food Services",
      str_detect(sector, "Transport|Logistic") ~ "Transportation & Logistics",
      str_detect(sector, "Construct|Repair|Maintenance") ~ "Construction, Repair & Maintenance",
      str_detect(sector, "Finance") ~ "Finance",
      str_detect(sector, "Oil|Gas|Energy|Utilit") ~ "Oil, Gas, Energy & Utilities",
      ## fixed: was "ArtsEntertain|Recreat" — the fused literal
      ## "ArtsEntertain" could never match "Arts" or "Entertainment"
      str_detect(sector, "Arts|Entertain|Recreat") ~ "Arts, Entertainment & Recreation",
      str_detect(sector, "Telecom") ~ "Telecommunications",
      str_detect(sector, "Gov") ~ "Government",
      TRUE ~ NA_character_
    )
  ) %>% 
  full_join(df_gd_join) %>% 
  mutate(
    ## normalize whitespace in the descriptions
    description = str_replace_all(description, "\\r", " "),
    description = str_replace_all(description, "\\n", " "),
    description = str_replace_all(description, "\\s+", " ")
  ) %>% 
  distinct(job_title, employer, location, .keep_all = TRUE)
  #distinct(job_title, employer, location, description, .keep_all = TRUE) ## description often slightly different (diff: 111 offers)

We use two data sets:

  • Glassdoor data (original) with 908 observations
  • LinkedIn data (original) with 62,095 observations
    • subset for “SEO”: with 2,387 observations
    • subset for “SEO” and English-speaking offers: with 603 observations

The LinkedIn data contain global job offers while the GlassDoor data only jobs from the US. The LinkedIn data including only job offers with the term SEO (or seo) contain 2,387 observations from English-speaking countries (USA, Canada, UK, Australia, Ireland, South Africa) and 552 from the USA and the UK (links starting with www.linkedin.com).

We merged both data sets and kept as many variables as possible, manually creating new variables for both datasets (GlassDoor: seniority and employment type; LinkedIn: sector) based on text matching of job titles and descriptions. We also removed as many duplicated entries as possible by matching job title, employer and job location. The final worldwide data set contains 3,127 observations.

Because the job offers are collected from all over the world, a lot of foreign terms are included. Thus, we merged the GlassDoor data also with the English subset of the LinkedIn data and kept again as many variables as possible by manually creating new variables for both data sets. The final “All English” data set contains 1,344 observations.

The GlassDoor data are cleaner with regard to job titles and descriptions than the LinkedIn data. Consequently, some plots using the GlassDoor data do a better job, so for now we provide both versions (the merged “All English” data set and the GlassDoor data set).

Also, the GlassDoor data contain information that is missing from the LinkedIn data, such as estimated salary range, rating, employer, industry, and size (no. of employees).

1. Job Title

2. Location

  • Where do companies hire SEOs? (beautiful US Map)

Data

Data English-Speaking

## locations English-speaking countries:
## number of job offers per unique location, most frequent first.
## (count(location) groups, counts and returns ungrouped output, so the
## previous group_by()/ungroup() pair was redundant; also fixed `sort = T`
## — always spell out TRUE/FALSE)
df_loc_en <-
  df_en %>% 
  count(location, sort = TRUE)

## add geocodes of cities in English-speaking countries;
## results are cached on disk so the geocoding API is hit only once
path <- here::here("proc_data", "geocodes_en.Rds")

if (!file.exists(path)) {
  ## geocode all locations, keep the first candidate per query, cache result
  df_geo_en <- 
    gio_batch_geocode(df_loc_en$location) %>% 
    unnest(response_results, .preserve = response_warnings) %>% 
    dplyr::select(query, formatted_address, location.lat, location.lng) %>% 
    group_by(query) %>% 
    slice(1)

  saveRDS(df_geo_en, path)
} else {
  ## reuse the cached geocodes
  df_geo_en <- readRDS(path)
}

## map data North America:
## attach geocodes to the location counts, keep only US cities, and
## project/crop the points to the states basemap
sf_map_us <-
  df_loc_en %>% 
  left_join(df_geo_en, by = c("location" = "query")) %>% 
  filter(
    !is.na(location.lng),
    ## drop all non-US locations in one pass
    !str_detect(location, "United Kingdom|South Africa|Australia|Ireland|Canada")
  ) %>% 
  st_as_sf(coords = c("location.lng", "location.lat"), crs = 4326) %>% 
  st_transform(st_crs(sf_states)) %>% 
  st_crop(st_bbox(sf_states))

## long version with 1 row per offer to count spatially:
## each location is repeated n times (one row per job offer)
sf_map_us_long <-
  df_loc_en %>% 
  group_by(location) %>% 
  ## fixed: seq_len(n) replaces the fragile seq(1:n) idiom, which only
  ## worked via seq()'s seq_along() fallback for vector input
  expand(n = seq_len(n)) %>% 
  left_join(df_geo_en, by = c("location" = "query")) %>% 
  filter(
    !is.na(location.lng),
    ## drop all non-US locations in one pass
    !str_detect(location, "United Kingdom|South Africa|Australia|Ireland|Canada")
  ) %>% 
  st_as_sf(coords = c("location.lng", "location.lat"), 
           crs = 4326) %>% 
  st_transform(st_crs(sf_states)) %>% 
  st_crop(st_bbox(sf_states))

2.1 Hot Spots (Cities)

North America Bubble Map

2.2 Hot Spots (US States)

Choropleth

(This map is derived from spatial locations by intersecting cities with state polygons - thus slightly different numbers compared to the hexagonal grid map which uses states as stated by the source.)

Hex Map

3. Company Info

3.2 Revenue

  • How much revenue do hiring companies make?

-> Counts of unique companies per revenue class

  • What type of skill sets are required by high-revenue companies?

I tokenized the descriptions and removed stop words and numbers, as well as manually selected nonsense/non-skill-related words. There may be more to remove, but if we keep this approach we can take a closer look, I would say.

## manually curated non-skill/noise words to drop from the description tokens
noise_words <- c(
  "seo", "experience", "skills", "skill", "ability", "requirement",
  "requirements", "link", "required", "provide", "day", "email", "based",
  "growth", "recommendations", "employment", "paid", "payment", "key",
  "gender", "equal", "applicants", "application", "disability",
  "disabilities", "world", "qualifications", "multiple", "page", "pages",
  "related", "site", "candidate", "insurance", "company", "agency",
  "office", "position"
)

## tokenize descriptions, remove stop words, curated noise words and tokens
## containing digits, and collapse "teams" into "team"
## NOTE(review): the class "[0-9+]" also drops words containing a literal
## "+" — confirm whether "[0-9]" was intended
df_rev_skills <-
  df_revenue %>% 
  unnest_tokens(word, description, token = "words") %>% 
  anti_join(stop_words) %>% 
  filter(
    !word %in% noise_words,
    !str_detect(word, "[0-9+]")
  ) %>% 
  mutate(word = if_else(word == "teams", "team", word))

## fixed seed so the word-cloud layout is reproducible
set.seed(2020)

## build a square word cloud of the 75 most frequent skill words for the
## given revenue classes (deduplicates the previously copy-pasted
## cloud_low/cloud_high code; also fixed `sort = T` -> TRUE)
plot_revenue_cloud <- function(data, revenue_classes, title) {
  data %>% 
    filter(revenue %in% revenue_classes) %>% 
    count(word, sort = TRUE) %>% 
    top_n(75, n) %>% 
    ggplot(aes(label = word, size = n, color = n)) +
      geom_text_wordcloud(
        family = "Montserrat",
        fontface = "bold",
        shape = "square",
        grid_margin = 2.5
      ) +
      rcartocolor::scale_color_carto_c(palette = "Emrld") +
      scale_size_area(max_size = 7) +
      labs(title = title) +
      theme_minimal()
}

cloud_low <- plot_revenue_cloud(
  df_rev_skills,
  c("< $1M", "$1–5M", "$5–10M", "$10–25M", "$25–50M", "$50–100M"),
  "\nCompanies with revenues lower than $100M"
)

cloud_high <- plot_revenue_cloud(
  df_rev_skills,
  c("$100–500M", "$500M–1B", "$1–2B", "$2–5B", "$5–10B", "> $10B"),
  "\nCompanies with revenues higher than $100M"
)

## vertical alignment; patchwork's `*` applies the title tweak to both panels
(cloud_low / cloud_high) * 
  theme(plot.title = element_text(family = "Montserrat", size = 14, face = "bold", 
                                  hjust = .5, margin = margin(b = -30)))

5 Job Requirements

5.1 Education

  • What type of degrees are most often required (BS vs MS vs None)?

Bachelors: 38 Masters: 10 Doctorate: 1

  • Do larger companies require a formal education?

  • Which education level is required by high-revenue companies?

5.2 Programming Languages

  • What programming languages are most often required?

I for now use the programming languages listed by the SO yearly survey: JavaScript, HTML/CSS, SQL, Python, Java, Bash/Shell/PowerShell, C#, PHP, C++, TypeScript, C, Ruby, Go, Assembly, Swift, Kotlin, R, VBA, Objective-C, Scala, Rust, Dart, Elixir, Clojure, WebAssembly + Julia

  • What languages are most often required in combination (e.g. Html, CSS)

5.4 Years of Experience

  • What type of SEO jobs require +5 years experience?

–> Simply filter by “5+ years”? Or search for years of experience and extract the number? How do we handle offers that phrase the same requirement in different ways?

6 Salaries

6.2 Salaries per Location

  • (Where do companies pay the most/least?)

6.3 Salaries for Different Positions

  • (What are the highest/lowest paying SEO positions?)

6.4 Salaries per Job Requirement

  • Do positions with programming languages pay significantly more? TODO

Convert PDFs to PNGs

## Converting page 1 to 1_1_jobs_cat_1.png... done!
## Converting page 1 to 1_1_jobs_tech_adj_1.png... done!
## Converting page 1 to 1_1_jobs_word_1.png... done!
## Converting page 1 to 2_1_map_northamerica_cities_1.png... done!
## Converting page 1 to 2_1_map_states_cities_1.png... done!
## Converting page 1 to 2_2_map_states_chloro_1.png... done!
## Converting page 1 to 2_2_map_states_chloro2_1.png... done!
## Converting page 1 to 2_2_map_states_hex_1.png... done!
## Converting page 1 to 2_3_map_counties_chloro_1.png... done!
## Converting page 1 to 3_1_size_histo_1.png... done!
## Converting page 1 to 3_2_revenue_histo_1.png... done!
## Converting page 1 to 3_2_revenue_words_horizontal_1.png... done!
## Converting page 1 to 3_2_revenue_words_vertical_1.png... done!
## Converting page 1 to 3_3_industry_counts_1.png... done!
## Converting page 1 to 3_3_sector_counts_1.png... done!
## Converting page 1 to 3_4_rating_histo_1.png... done!
## Converting page 1 to 3_4_rating_lolli_1.png... done!
## Converting page 1 to 4_cloud_sequ_1.png... done!
## Converting page 1 to 4_cloud_word_1.png... done!
## Converting page 1 to 5_1_require_edu_histo_1.png... done!
## Converting page 1 to 5_1_require_edu_revenue_angle_1.png... done!
## Converting page 1 to 5_1_require_edu_revenue_dodge_1.png... done!
## Converting page 1 to 5_1_require_edu_revenue_small_1.png... done!
## Converting page 1 to 5_1_require_edu_size_1.png... done!
## Converting page 1 to 5_2_require_prog_1.png... done!
## Converting page 1 to 5_2_require_prog_comb_1.png... done!
## Converting page 1 to 6_1_salary_histo_1.png... done!
## Converting page 1 to 6_5_rating_salary_1.png... done!